library(tidyverse)
library(readr)
train <- read.csv("train.csv", stringsAsFactors = TRUE)
train <- train %>%
mutate(Alley = as.character(Alley),
Alley = replace_na(Alley, "None"),
Alley = as.factor(Alley)) %>%
mutate(TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
RichNbrhd = case_when(Neighborhood %in% c("StoneBr", "NridgHt", "NoRidge") ~ 1,
TRUE ~ 0)) %>%
mutate(TotalBaths = FullBath + HalfBath/2 + BsmtFullBath + BsmtHalfBath/2)
First take a look at the pairs plot.
Looks like TotalSF, GrLiveArea, YearBuilt, YearRemodAdd, and GarageArea all look like they effect the SalesPrice. We can test the lm.
mylm = lm(SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea +
TotalSF:GrLivArea + TotalSF:YearBuilt + TotalSF:YearRemodAdd,data = train)
summary(mylm)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd +
## GarageArea + TotalSF:GrLivArea + TotalSF:YearBuilt + TotalSF:YearRemodAdd,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -363670 -18979 -993 16389 365061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.110e+06 3.805e+05 8.173 6.48e-16 ***
## TotalSF -2.028e+03 1.513e+02 -13.402 < 2e-16 ***
## GrLivArea 7.466e+01 5.169e+00 14.443 < 2e-16 ***
## YearBuilt -3.302e+02 1.441e+02 -2.292 0.0221 *
## YearRemodAdd -1.271e+03 2.092e+02 -6.076 1.57e-09 ***
## GarageArea 4.743e+01 6.190e+00 7.661 3.35e-14 ***
## TotalSF:GrLivArea -1.409e-02 1.059e-03 -13.308 < 2e-16 ***
## TotalSF:YearBuilt 2.744e-01 5.535e-02 4.957 8.01e-07 ***
## TotalSF:YearRemodAdd 7.813e-01 8.754e-02 8.924 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38470 on 1451 degrees of freedom
## Multiple R-squared: 0.7667, Adjusted R-squared: 0.7654
## F-statistic: 596.2 on 8 and 1451 DF, p-value: < 2.2e-16
The lm shows promise so some interactions where tested.
mylm = lm(SalePrice ~
TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea +
TotalSF:(GrLivArea + YearBuilt + YearRemodAdd) +
GrLivArea:(YearBuilt + GarageArea) +
YearBuilt:(GarageArea) +
TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea
,data = train)
summary(mylm)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd +
## GarageArea + TotalSF:(GrLivArea + YearBuilt + YearRemodAdd) +
## GrLivArea:(YearBuilt + GarageArea) + YearBuilt:(GarageArea) +
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -406737 -16695 440 14573 253504
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 2.607e+06 3.589e+05
## TotalSF -2.037e+03 2.076e+02
## GrLivArea 6.383e+02 2.539e+02
## YearBuilt -4.719e+02 1.344e+02
## YearRemodAdd -8.032e+02 1.975e+02
## GarageArea -1.295e+03 3.317e+02
## TotalSF:GrLivArea 1.804e-02 2.516e-03
## TotalSF:YearBuilt 4.783e-01 1.101e-01
## TotalSF:YearRemodAdd 5.711e-01 8.452e-02
## GrLivArea:YearBuilt -3.613e-01 1.294e-01
## GrLivArea:GarageArea 1.840e-01 1.245e-02
## YearBuilt:GarageArea 6.041e-01 1.690e-01
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -7.204e-12 4.334e-13
## t value Pr(>|t|)
## (Intercept) 7.264 6.12e-13 ***
## TotalSF -9.812 < 2e-16 ***
## GrLivArea 2.514 0.012038 *
## YearBuilt -3.512 0.000459 ***
## YearRemodAdd -4.067 5.02e-05 ***
## GarageArea -3.903 9.92e-05 ***
## TotalSF:GrLivArea 7.171 1.18e-12 ***
## TotalSF:YearBuilt 4.345 1.49e-05 ***
## TotalSF:YearRemodAdd 6.757 2.03e-11 ***
## GrLivArea:YearBuilt -2.792 0.005304 **
## GrLivArea:GarageArea 14.774 < 2e-16 ***
## YearBuilt:GarageArea 3.575 0.000362 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.621 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35030 on 1447 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.8056
## F-statistic: 504.9 on 12 and 1447 DF, p-value: < 2.2e-16
This gives a better r squared, let start plotting the chart to find more variables.
ggplot(train, aes(y=SalePrice, x = TotalSF, color = RichNbrhd)) +
geom_point()
Rich Neighborhood seems to be a good indicator of sales price, when including it in the model we get:
mylm = lm(SalePrice ~
TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea +
TotalSF:(YearRemodAdd) +
GrLivArea:(YearBuilt + GarageArea) +
YearBuilt:(GarageArea) +
TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
RichNbrhd + RichNbrhd:TotalSF
,data = train)
summary(mylm)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd +
## GarageArea + TotalSF:(YearRemodAdd) + GrLivArea:(YearBuilt +
## GarageArea) + YearBuilt:(GarageArea) + TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
## RichNbrhd + RichNbrhd:TotalSF, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -213945 -15062 -82 13542 171844
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 7.905e+05 3.377e+05
## TotalSF -1.055e+03 1.363e+02
## GrLivArea 2.467e+02 1.219e+02
## YearBuilt 3.675e+02 1.051e+02
## YearRemodAdd -7.425e+02 1.648e+02
## GarageArea -9.082e+02 2.798e+02
## RichNbrhd -1.770e+05 1.319e+04
## TotalSF:YearRemodAdd 5.549e-01 6.883e-02
## GrLivArea:YearBuilt -1.295e-01 6.227e-02
## GrLivArea:GarageArea 1.033e-01 1.080e-02
## YearBuilt:GarageArea 4.347e-01 1.422e-01
## TotalSF:RichNbrhd 6.312e+01 3.718e+00
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -3.348e-12 2.040e-13
## t value Pr(>|t|)
## (Intercept) 2.341 0.019385 *
## TotalSF -7.736 1.90e-14 ***
## GrLivArea 2.024 0.043183 *
## YearBuilt 3.497 0.000484 ***
## YearRemodAdd -4.505 7.18e-06 ***
## GarageArea -3.246 0.001198 **
## RichNbrhd -13.413 < 2e-16 ***
## TotalSF:YearRemodAdd 8.062 1.56e-15 ***
## GrLivArea:YearBuilt -2.079 0.037749 *
## GrLivArea:GarageArea 9.559 < 2e-16 ***
## YearBuilt:GarageArea 3.057 0.002277 **
## TotalSF:RichNbrhd 16.976 < 2e-16 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.416 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31520 on 1447 degrees of freedom
## Multiple R-squared: 0.8438, Adjusted R-squared: 0.8425
## F-statistic: 651.5 on 12 and 1447 DF, p-value: < 2.2e-16
This helped to increase the rsquared of the lm. Next I want to look at the baths in the house.
ggplot(train, aes(y=SalePrice, x = TotalSF, color = interaction(as.factor(TotalBaths)))) +
geom_point() +
facet_wrap(~interaction(as.factor(TotalBaths)))
Besides the 4.5 bath houses it seems that there is a relationship between the number of baths and the price of the house. Adding Total Baths to the lm we get:
mylm = lm(SalePrice ~
TotalSF + GrLivArea + YearBuilt + YearRemodAdd + GarageArea +
TotalSF:(YearRemodAdd) +
GrLivArea:(YearBuilt + GarageArea) +
YearBuilt:(GarageArea) +
TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
RichNbrhd + RichNbrhd:TotalSF +
TotalBaths + RichNbrhd:TotalBaths
,data = train)
summary(mylm)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + GrLivArea + YearBuilt + YearRemodAdd +
## GarageArea + TotalSF:(YearRemodAdd) + GrLivArea:(YearBuilt +
## GarageArea) + YearBuilt:(GarageArea) + TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea +
## RichNbrhd + RichNbrhd:TotalSF + TotalBaths + RichNbrhd:TotalBaths,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -210966 -14860 -234 12922 174373
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.169e+06 3.383e+05
## TotalSF -1.160e+03 1.346e+02
## GrLivArea 2.929e+02 1.201e+02
## YearBuilt 3.120e+02 1.047e+02
## YearRemodAdd -8.829e+02 1.631e+02
## GarageArea -9.708e+02 2.749e+02
## RichNbrhd -2.147e+05 1.560e+04
## TotalBaths 7.763e+03 1.585e+03
## TotalSF:YearRemodAdd 6.076e-01 6.797e-02
## GrLivArea:YearBuilt -1.553e-01 6.134e-02
## GrLivArea:GarageArea 9.746e-02 1.064e-02
## YearBuilt:GarageArea 4.701e-01 1.397e-01
## TotalSF:RichNbrhd 5.654e+01 4.076e+00
## RichNbrhd:TotalBaths 2.110e+04 5.185e+03
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -3.258e-12 2.007e-13
## t value Pr(>|t|)
## (Intercept) 3.455 0.000565 ***
## TotalSF -8.612 < 2e-16 ***
## GrLivArea 2.440 0.014819 *
## YearBuilt 2.980 0.002930 **
## YearRemodAdd -5.414 7.20e-08 ***
## GarageArea -3.531 0.000426 ***
## RichNbrhd -13.766 < 2e-16 ***
## TotalBaths 4.898 1.08e-06 ***
## TotalSF:YearRemodAdd 8.939 < 2e-16 ***
## GrLivArea:YearBuilt -2.533 0.011424 *
## GrLivArea:GarageArea 9.160 < 2e-16 ***
## YearBuilt:GarageArea 3.364 0.000788 ***
## TotalSF:RichNbrhd 13.874 < 2e-16 ***
## RichNbrhd:TotalBaths 4.068 4.99e-05 ***
## TotalSF:GrLivArea:YearBuilt:YearRemodAdd:GarageArea -16.233 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30960 on 1445 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8481
## F-statistic: 583.1 on 14 and 1445 DF, p-value: < 2.2e-16
After reviewing the lm it seems like GrLivArea and TotalSF are linked because the living area will scale with squarefootage, so GRLivArea will be removed. YearBuilt and GarageArea would have a interpritaiton of “the average price increases by $0.47 for every unit increase to either GarageArea or YearBuilt. Such a interpritation doesn’t make sense and removing it keeps the lm significant.
mylm = lm(SalePrice ~
TotalSF + YearRemodAdd + GarageArea +
TotalSF:(YearBuilt) +
TotalSF:YearBuilt:YearRemodAdd:GarageArea +
RichNbrhd + RichNbrhd:TotalSF +
TotalBaths + RichNbrhd:TotalBaths
,data = train)
summary(mylm)
##
## Call:
## lm(formula = SalePrice ~ TotalSF + YearRemodAdd + GarageArea +
## TotalSF:(YearBuilt) + TotalSF:YearBuilt:YearRemodAdd:GarageArea +
## RichNbrhd + RichNbrhd:TotalSF + TotalBaths + RichNbrhd:TotalBaths,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -248033 -16369 -1957 12971 253335
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.189e+06 1.077e+05 -11.034
## TotalSF -1.363e+02 2.993e+01 -4.554
## YearRemodAdd 5.937e+02 5.482e+01 10.829
## GarageArea 1.255e+02 9.644e+00 13.011
## RichNbrhd -2.227e+05 1.665e+04 -13.381
## TotalBaths 1.044e+04 1.658e+03 6.296
## TotalSF:YearBuilt 9.670e-02 1.549e-02 6.245
## TotalSF:RichNbrhd 6.477e+01 4.400e+00 14.722
## RichNbrhd:TotalBaths 1.993e+04 5.681e+03 3.509
## TotalSF:YearRemodAdd:GarageArea:YearBuilt -7.451e-09 8.413e-10 -8.857
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## TotalSF 5.71e-06 ***
## YearRemodAdd < 2e-16 ***
## GarageArea < 2e-16 ***
## RichNbrhd < 2e-16 ***
## TotalBaths 4.03e-10 ***
## TotalSF:YearBuilt 5.57e-10 ***
## TotalSF:RichNbrhd < 2e-16 ***
## RichNbrhd:TotalBaths 0.000463 ***
## TotalSF:YearRemodAdd:GarageArea:YearBuilt < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34230 on 1450 degrees of freedom
## Multiple R-squared: 0.8155, Adjusted R-squared: 0.8144
## F-statistic: 712.3 on 9 and 1450 DF, p-value: < 2.2e-16
set.seed(122)
num_rows <- 1000 #1460 total
keep <- sample(1:nrow(train), num_rows)
mytrain <- train[keep, ]
mytest <- train[-keep, ]
# Compute R-squared for each validation
# Get y-hat for each model on new data.
yht <- predict(mylm, newdata=mytest)
# Compute y-bar
ybar <- mean(mytest$SalePrice) #Yi is given by Ynew from the new sample of data
# Compute SSTO
SSTO <- sum( (mytest$SalePrice - ybar)^2 )
# Compute SSE for each model using y - yhat
SSEt <- sum( (mytest$SalePrice - yht)^2 )
# Compute R-squared for each
rst <- 1 - SSEt/SSTO
# Compute adjusted R-squared for each
n <- length(mytest$SalePrice) #sample size
pt <- length(coef(mylm)) #num. parameters in model
rsta <- 1 - (n-1)/(n-pt)*SSEt/SSTO
my_output_table2 <- data.frame(Model = c("My House Price Model"), `Original R2` = c(summary(mylm)$r.squared), `Orig. Adj. R-squared` = c(summary(mylm)$adj.r.squared), `Validation R-squared` = c(rst), `Validation Adj. R^2` = c(rsta))
colnames(my_output_table2) <- c("Model", "Original $R^2$", "Original Adj. $R^2$", "Validation $R^2$", "Validation Adj. $R^2$")
knitr::kable(my_output_table2, escape=TRUE, digits=4)
| Model | Original \(R^2\) | Original Adj. \(R^2\) | Validation \(R^2\) | Validation Adj. \(R^2\) |
|---|---|---|---|---|
| My House Price Model | 0.8155 | 0.8144 | 0.8591 | 0.8562 |
The validation shows that the model works well.
My final model is: \[ \begin{aligned} \hat{Y} =\ & \beta_0 + \beta_1 X_{\text{TotalSF}} + \beta_2 X_{\text{YearRemodAdd}} + \beta_3 X_{\text{GarageArea}} \\ &+ \beta_4 (X_{\text{TotalSF}} \cdot X_{\text{YearBuilt}}) \\ &+ \beta_5 (X_{\text{TotalSF}} \cdot X_{\text{YearBuilt}} \cdot X_{\text{YearRemodAdd}} \cdot X_{\text{GarageArea}}) \\ & + \beta_6 X_{\text{RichNbrhd}} + \beta_7 (X_{\text{RichNbrhd}} \cdot X_{\text{TotalSF}}) \\ &+ \beta_8 X_{\text{TotalBaths}} + \beta_{9} (X_{\text{RichNbrhd}} \cdot X_{\text{TotalBaths}}) \end{aligned} \]
These can be interpreted in the following ways:
The following plot uses the averages of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths, all rounded to the nearest whole number.
## Hint: library(car) has a scatterplot 3d function which is simple to use
# but the code should only be run in your console, not knit.
library(plotly)
library(reshape2)
library(car)
## scatter3d(Y ~ X1 + X2, data=yourdata)
## To embed the 3d-scatterplot inside of your html document is harder.
#library(plotly)
#library(reshape2)
#Perform the multiple regression
#Graph Resolution (more important for more complex shapes)
graph_reso <- 100
#Setup Axis
axis_x <- seq(min(train$TotalSF), max(train$TotalSF), by = graph_reso)
axis_y <- seq(min(train$YearBuilt), max(train$YearBuilt), by = 1)
#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>%
mutate(YearRemodAdd = 1985,
GarageArea = 473,
RichNbrhd = 0,
TotalBaths = 2)
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(train,
x = ~TotalSF,
y = ~YearBuilt,
z = ~SalePrice,
type = "scatter3d",
mode = "markers") %>%
add_trace(z = price_surface,
x = axis_x,
y = axis_y,
type = "surface")
The following is a plot using the maximum of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths.
#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>%
mutate(YearRemodAdd = max(train$YearRemodAdd),
GarageArea = max(train$GarageArea),
RichNbrhd = max(train$RichNbrhd),
TotalBaths = max(train$TotalBaths))
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(train,
x = ~TotalSF,
y = ~YearBuilt,
z = ~SalePrice,
type = "scatter3d",
mode = "markers") %>%
add_trace(z = price_surface,
x = axis_x,
y = axis_y,
type = "surface")
The following is a plot using the minimum of YearRemodAdd, GarageArea, RichNbrhd, and TotalBaths.
#Sample points
price_surface <- expand.grid(TotalSF = axis_x, YearBuilt = axis_y, KEEP.OUT.ATTRS=F) %>%
mutate(YearRemodAdd = min(train$YearRemodAdd),
GarageArea = min(train$GarageArea),
RichNbrhd = min(train$RichNbrhd),
TotalBaths = min(train$TotalBaths))
price_surface$Z <- predict.lm(mylm, newdata = price_surface)
price_surface <- acast(price_surface, YearBuilt ~ TotalSF, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(train,
x = ~TotalSF,
y = ~YearBuilt,
z = ~SalePrice,
type = "scatter3d",
mode = "markers") %>%
add_trace(z = price_surface,
x = axis_x,
y = axis_y,
type = "surface")